# ------------------------------
# ' recreate "sameness" index at PUMA or county levels
# ' this code can create yearly measures as well as overall measures
# ------------------------------

# custom function to measure SD for weighted sd
weighted.sd <- function(x, w = NULL, na.rm = FALSE) {
	if (na.rm) {
		na <- is.na(x) | is.na(w)
		x <- x[!na]
		w <- w[!na]
	}
	weighted_sd = sqrt(sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1))
	return(weighted_sd)
}

#------------------------------------------------------------------------------
# load processed data : 
rdata = read_fst(file.path(processed_path,paste0('merged_county_v1.fst')), as.data.table=TRUE)

#------------------------------------------------------------------------------
# Data Preprocessing
rdata[, Race5 := Race4]
rdata[Hisp == 1, Race5 := 5] # Hispanic or other races
rdata[, Hisp := NULL]

rdata[, MarStat5 := MarStat6]
rdata[MarStat5 == 6, MarStat5 := 5]
rdata[, MarStat6 := NULL]

rdata[, St := state]
setnames(rdata,'DSID','Suic')


# drop younger than 15
rdata = rdata[AgeGrp4 != 0,]

# drop missing states
rdata = rdata[!is.na(St),]

# select only available Year
rdata = rdata[Year >= 2005,]

# only select time-series indicators
list_var = c('Sex','AgeGrp4','Race5','MarStat5','BornUSA','UnEmpl','PhysProb')

rdata[,c('Age','Race4','Hisp') := NULL]

controls = c('Rat_Poverty', 'Rat_Mig_Cum' ,'Pop_Den', 
	'Rat_GC_ProE', 'Rat_GC_ProM' ,'Rat_GC_ProB','Rat_GC_Cath','Rat_GC_Jew','Rat_GC_Oth')

export(rdata[,c('uid','Year','county',controls),with=FALSE], 
	file.path(processed_path, paste0('suicide_reg_','R','_v1.dta')))

ind_rdata = rdata[,c('county','Year','uid', 'Suic', 'Sex','AgeGrp4','MarStat5','Race5','BornUSA','UnEmpl','PhysProb','county_weight','St')]
ind_rdata = ind_rdata[!is.na(Sex) & !is.na(AgeGrp4) & !is.na(Race5) & !is.na(county) & !is.na(Year),]

# merge two data sets
for (yy in 2005:2011){

	outfile = file.path(processed_path, paste0('suicide_reg_','Y',yy,'_v1.dta'))

	if (file.exists(outfile)) {
	
	data = ind_rdata[Year==yy,]

	# load imputed data by year
	mdata = read_fst(file.path(processed_path,'imputed_by_year',paste0('imputed_M10_',yy,'.fst')), as.data.table=TRUE)
	for (var in c(list_var,'Year','county')) mdata[[var]] = as.integer(as.character(mdata[[var]]))
	mdata[, county := stringr::str_pad(county, width=5, pad='0')]
	setnames(mdata,'.imp','mm')

	data = merge(mdata, data[,c('uid','Year','county','St',paste0('county','_weight')),with=FALSE], 
		by=c('uid','Year','county'),all.x=TRUE)		
	data$ObsWgt0 = data[[paste0('county','_weight')]]		
	
	list_var = c('Sex','AgeGrp4','Race5','MarStat5','BornUSA', 'UnEmpl','PhysProb')
	
	#==============================================================================
	# recreate measures for % same category in their community
	for (var in list_var){
		prop_data = data[, .(N=sum(ObsWgt0)), by = c('county', var)][, prop := 100 * N / sum(N), by = c('county')]
		prop_data = na.omit(prop_data)
	
		# add standardized variables by each category
		prop_data[,mean_prop := mean(prop), by=var] # overall mean
		prop_data[,sd_prop := sd(prop), by=var] # overall sd 
		prop_data[,std_prop := (prop - mean_prop) / sd_prop]
		prop_data[, `:=`(N = NULL, mean_prop = NULL, sd_prop = NULL)]
	
		data = merge(x = data, y = prop_data, by=c('county',var),all.x=TRUE)
			setnames(data, 'prop', paste0('same_prop_',var))
			setnames(data, 'std_prop', paste0('std_same_prop_',var))
	}
	
	# generate some proportion variables : gender/age/race again
	data[, Female := as.integer(Sex == 2)]
	data[, RAT_Female := 100 * weighted.mean(Female, ObsWgt0, na.rm=TRUE), by='county']
	
	var_value = sort(unique(data[["AgeGrp4"]]))
	for (i in var_value) {
		mean_data = data[, weighted.mean(AgeGrp4 == i, ObsWgt0, na.rm=TRUE), by='county']
		mean_data[,V1 := 100 * V1]
		setnames(mean_data, 'V1',paste0('RAT_AgeGrp4_',i))
		data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
	}
	
	var_value = sort(unique(data[["Race5"]]))
	for (i in var_value) {
		mean_data = data[, weighted.mean(Race5 == i, ObsWgt0, na.rm=TRUE), by='county']
		mean_data[,V1 := 100 * V1]
		setnames(mean_data, 'V1',paste0('RAT_Race5_',i))
		data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
	}
	
	var_value = sort(unique(data[["MarStat5"]]))
	for (i in var_value) {
		mean_data = data[, weighted.mean(MarStat5 == i, ObsWgt0, na.rm=TRUE), by='county']
		mean_data[,V1 := 100 * V1]
		setnames(mean_data, 'V1',paste0('RAT_MarStat5_',i))
		data = merge(x = data, y = mean_data, by=c('county'), all.x=TRUE)
	}
	
	data[, RAT_BornUSA := 100 * weighted.mean(BornUSA, ObsWgt0, na.rm=TRUE), by='county']
	data[, RAT_UnEmpl := 100 * weighted.mean(UnEmpl, ObsWgt0, na.rm=TRUE), by='county']

	keepcols = c('mm','uid','ObsWgt0','county','Year','Suic','Female','St',
		list_var, 
		grep('same_',names(data),value=TRUE),
		grep('RAT_',names(data),value=TRUE)
		)
	keep_data = data[,..keepcols]
	
	export(keep_data, outfile)
	rm(keep_data,data); gc()
	}	
}



